{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6a16e9fb-fc9c-4480-9945-ad88d533dec5",
   "metadata": {},
   "source": [
    "#### Objective:- Create table for generating race migration points."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "56b869b5-0017-484c-b01f-29c64cfab4a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd, numpy as np\n",
    "import cudf, cupy as cp\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "783f4ca5-ba3c-4808-b24f-eb8ef50ec83b",
   "metadata": {
    "tags": []
   },
   "source": [
    "### DATA PREP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "97221830-a6f8-41c9-b26f-090b07582965",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE</th>\n",
       "      <th>P_delta</th>\n",
       "      <th>points</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10010201001001</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10010201001002</td>\n",
       "      <td>1</td>\n",
       "      <td>-23</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10010201001003</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10010201001005</td>\n",
       "      <td>1</td>\n",
       "      <td>-8</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID20  STATE  P_delta  points\n",
       "0  10010201001000      1      -10      22\n",
       "1  10010201001001      1        4      24\n",
       "2  10010201001002      1      -23      23\n",
       "3  10010201001003      1        4       8\n",
       "4  10010201001005      1       -8       8"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df = cudf.read_csv('census_full/HI_DE_DC_mapped_data.csv').drop('Unnamed: 0',axis=1)\n",
    "df = cudf.read_csv('data/mapped_data_with_race.csv',usecols=['ID20','R'],dtype={'ID20':'int64','STATE':'int32','COUNTY':'str','points':'int32'})\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2a5f9098-fb9f-4217-a568-37d8d8bd9a9c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6194258"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0e0b2120-fff4-46c1-9695-bc4e62ec88b7",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Generate random points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1315f522-090c-4231-84a3-d1c52efbfd59",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df[df.STATE==12].reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2fd2048e-0459-43a7-b83e-b148337a3f5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "150000"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "74a95a13-fd77-4844-8308-2ff516d6c4c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.iloc[:150000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "86c69b18-1157-4ec0-9c47-3bb032245ac9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE</th>\n",
       "      <th>P_delta</th>\n",
       "      <th>points</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>933607</td>\n",
       "      <td>120010002011000</td>\n",
       "      <td>12</td>\n",
       "      <td>-1</td>\n",
       "      <td>29</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>933608</td>\n",
       "      <td>120010002011001</td>\n",
       "      <td>12</td>\n",
       "      <td>-4</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>933609</td>\n",
       "      <td>120010002011002</td>\n",
       "      <td>12</td>\n",
       "      <td>-23</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>933610</td>\n",
       "      <td>120010002011003</td>\n",
       "      <td>12</td>\n",
       "      <td>-7</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>933611</td>\n",
       "      <td>120010002011004</td>\n",
       "      <td>12</td>\n",
       "      <td>2</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    index             ID20  STATE  P_delta  points\n",
       "0  933607  120010002011000     12       -1      29\n",
       "1  933608  120010002011001     12       -4      12\n",
       "2  933609  120010002011002     12      -23      23\n",
       "3  933610  120010002011003     12       -7       7\n",
       "4  933611  120010002011004     12        2      16"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "e8fe7019-255e-4bc3-902f-752d179f668d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>ID20</th>\n",
       "      <th>STATE</th>\n",
       "      <th>P_delta</th>\n",
       "      <th>points</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>149995</th>\n",
       "      <td>1083602</td>\n",
       "      <td>120830010101019</td>\n",
       "      <td>12</td>\n",
       "      <td>17</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149996</th>\n",
       "      <td>1083603</td>\n",
       "      <td>120830010101020</td>\n",
       "      <td>12</td>\n",
       "      <td>0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149997</th>\n",
       "      <td>1083604</td>\n",
       "      <td>120830010101021</td>\n",
       "      <td>12</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149998</th>\n",
       "      <td>1083605</td>\n",
       "      <td>120830010101022</td>\n",
       "      <td>12</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149999</th>\n",
       "      <td>1083606</td>\n",
       "      <td>120830010101023</td>\n",
       "      <td>12</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          index             ID20  STATE  P_delta  points\n",
       "149995  1083602  120830010101019     12       17      17\n",
       "149996  1083603  120830010101020     12        0       8\n",
       "149997  1083604  120830010101021     12       10      10\n",
       "149998  1083605  120830010101022     12        6       6\n",
       "149999  1083606  120830010101023     12        2      14"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b3ae5ab3-264b-4e05-a2cc-360d897c1a2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def random_points_in_polygon(number, polygon):\n",
    "    # print(polygon)\n",
    "    points_x = np.array([])\n",
    "    points_y = np.array([])\n",
    "    min_x, min_y, max_x, max_y = polygon.bounds\n",
    "    i= 0\n",
    "    while i < number:\n",
    "        point_x = random.uniform(min_x, max_x)\n",
    "        point_y = random.uniform(min_y, max_y)\n",
    "        if polygon.contains(Point(point_x, point_y)):\n",
    "            points_x = np.append(points_x, point_x)\n",
    "            points_y = np.append(points_y, point_y)\n",
    "            i += 1\n",
    "    return points_x, points_y # returns list of points(lat), list of points(long)\n",
    "\n",
    "def generate_data(state, df_temp, gpdf):\n",
    "    t1 = datetime.datetime.now()\n",
    "    geoid_index_df = df_temp.index.to_numpy()\n",
    "    final_points_x = np.array([])\n",
    "    final_points_y = np.array([])\n",
    "    geoid = np.array([])\n",
    "    # f=0\n",
    "    for index, row in gpdf.iterrows():\n",
    "        # f+=1\n",
    "        points_x = np.array([])\n",
    "        points_y = np.array([])\n",
    "        geoid_temp = np.array([])\n",
    "        \n",
    "        if row['GEOID20'] in geoid_index_df and df_temp.loc[row['GEOID20']]>0:\n",
    "            num_points = df_temp.loc[row['GEOID20']]\n",
    "            polygon = row['geometry']\n",
    "            \n",
    "            if polygon is not None:\n",
    "                points_x, points_y = random_points_in_polygon(num_points, polygon)\n",
    "                # print(points_x,points_y)\n",
    "                geoid_temp = np.array([row['GEOID20']]*len(points_x))\n",
    "                geoid = np.append(geoid,geoid_temp)\n",
    "                final_points_x = np.append(final_points_x, points_x)\n",
    "                # print(final_points_x)\n",
    "                final_points_y = np.append(final_points_y, points_y)\n",
    "            print('Processing '+str(state)+' - Completed:', \"{0:0.2f}\".format((index/len(gpdf))*100), '%', end='')\n",
    "            print('', end='\\r')\n",
    "        \n",
    "        # if f==11:\n",
    "        #     break\n",
    "\n",
    "    print('Processing for '+str(state)+' complete \\n total time', datetime.datetime.now() - t1)\n",
    "    df_fin = cudf.DataFrame({'GEOID20': geoid,'x': final_points_x, 'y':final_points_y}) #,'COUNTY':county,'p_delta':p_delta,'p_net':p_net})\n",
    "    df_fin.GEOID20 = df_fin.GEOID20.astype('int').astype('str')\n",
    "    \n",
    "    df_fin.to_csv('data/migration_population_with_race/population_FL1_%s'%str(state)+'.csv', index=False)\n",
    "def exec_data(state_key_list):\n",
    "    c=0\n",
    "    for i in state_key_list:\n",
    "        # print(i)\n",
    "        c+=1\n",
    "        if i< 10:\n",
    "            i_str = '0'+str(i)\n",
    "        else:\n",
    "            i_str = str(i)\n",
    "        # path = 'census_2020_data/nhgis0003_shape/nhgis0003_shapefile_tl2020_%s0_block_2020/%s_block_2020.shp'%(i_str,states[i])\n",
    "        path ='data/tl_shapefiles/tl_2021_%s_tabblock20.shp'%(i_str)\n",
    "        #print(path)\n",
    "        print(\"started reading shape file for state \", states[i])\n",
    "        if os.path.isfile(path):    \n",
    "            gpdf = gpd.read_file(path)[['GEOID20', 'geometry']].sort_values('GEOID20').reset_index(drop=True)\n",
    "            gpdf.GEOID20 = gpdf.GEOID20.astype('int64')\n",
    "            gpdf = gpdf[gpdf.GEOID20<=120830010101023]\n",
    "            print(\"completed reading shape file for state \", states[i])\n",
    "            df_temp = df.query('STATE == @i')[['ID20', 'points']]\n",
    "            df_temp.index = df_temp.ID20\n",
    "            df_temp = df_temp['points']\n",
    "            # print(gpdf.head(3))\n",
    "            # print(df_temp)\n",
    "            print(\"starting to generate data for \"+str(states[i])+\"... \")\n",
    "            generate_data(states[i], df_temp, gpdf)\n",
    "            del(df_temp)\n",
    "        else:\n",
    "            print(\"shape file does not exist\")\n",
    "            continue\n",
    "        # if c==2:\n",
    "        #     break "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "e87b2dc6-83a4-4f6e-b3cd-70be279546b0",
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "started reading shape file for state  FL\n",
      "completed reading shape file for state  FL\n",
      "starting to generate data for FL... \n",
      "Processing for FL complete 100.00 %78.32 %\n",
      " total time 3:57:34.135479\n"
     ]
    }
   ],
   "source": [
    "# states = {1 :\"AL\",2 :\"AK\",4 :\"AZ\",5 :\"AR\",6 :\"CA\",8 :\"CO\",9 :\"CT\",10:\"DE\",11:\"DC\",12:\"FL\",13:\"GA\",15:\"HI\",\n",
    "#           16:\"ID\",17:\"IL\",18:\"IN\",19:\"IA\",20:\"KS\",21:\"KY\",22:\"LA\",23:\"ME\",24:\"MD\",25:\"MA\",26:\"MI\",27:\"MN\",\n",
    "#           28:\"MS\",29:\"MO\",30:\"MT\",31:\"NE\",32:\"NV\",33:\"NH\",34:\"NJ\",35:\"NM\",36:\"NY\",37:\"NC\",38:\"ND\",39:\"OH\",\n",
    "#           40:\"OK\",41:\"OR\",42:\"PA\",44:\"RI\",45:\"SC\",46:\"SD\",47:\"TN\",48:\"TX\",49:\"UT\",50:\"VT\",51:\"VA\",53:\"WA\",\n",
    "#           54:\"WV\",55:\"WI\",56:\"WY\",72:\"PR\"}\n",
    "\n",
    "states = {12:\"FL\"} \n",
    "exec_data(states.keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "152ebc2b-458b-4137-856a-14b510d1866c",
   "metadata": {
    "tags": []
   },
   "source": [
    "#### Concat divided states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "7a62c842-48df-4aab-b92e-193ca8e5ff9f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# part1 = cudf.read_csv('data/migration_population_with_race/population_FL1_FL.csv',dtype={'ID20':'int64','x':'float32','y':'float32'})\n",
    "# part2 = cudf.read_csv('data/migration_population_with_race/population_FL2_FL.csv',dtype={'ID20':'int64','x':'float32','y':'float32'})\n",
    "# part3 = cudf.read_csv('data/migration_population_with_race/population_FL3_FL.csv',dtype={'ID20':'int64','x':'float32','y':'float32'})\n",
    "# # part4 = cudf.read_csv('data/migration_population_with_race/population_NY4_NY.csv',dtype={'ID20':'int64','x':'float32','y':'float32'})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "7bdf0112-51e7-4301-bb86-ef14f6c7dd3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fl_df = cudf.concat([part1,part2,part3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "3d918deb-8bd4-4c4e-8150-df8a994a1491",
   "metadata": {},
   "outputs": [],
   "source": [
    "# df[df.STATE==12]['points'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "6a70ea02-8d45-4aba-81dc-2cfc9b4c8b54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fl_df.to_csv('data/migration_population_with_race/population_FL.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1eae95ae-10b4-4fce-a535-a1f7a5273d2d",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Concat States"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "73036a5d-6e9d-41ff-b600-9f0f8c49725f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_shape_and_states(state_key_list):\n",
    "    concat_states = cudf.DataFrame()\n",
    "    \n",
    "    for i in state_key_list:\n",
    "        if i< 10:\n",
    "            i_str = '0'+str(i)\n",
    "        else:\n",
    "            i_str = str(i)\n",
    "        path = 'data/migration_population_with_race//population_%s'%str(states[i])+'.csv'\n",
    "        if os.path.isfile(path):    \n",
    "            temp = cudf.read_csv(path,dtype={'ID20':'int64','x':'float32','y':'float32'})# Load shape files\n",
    "            concat_states = cudf.concat([concat_states,temp])\n",
    "        else:\n",
    "            print(i,states[i])\n",
    "            print(\"shape file does not exist\")\n",
    "            continue\n",
    "    return concat_states"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b0bc7c43-4677-4af0-b603-0f3feb814225",
   "metadata": {},
   "outputs": [],
   "source": [
    "states = {1 :\"AL\",2 :\"AK\",4 :\"AZ\",5 :\"AR\",6 :\"CA\",8 :\"CO\",9 :\"CT\",10:\"DE\",11:\"DC\",12:\"FL\",13:\"GA\",15:\"HI\",\n",
    "          16:\"ID\",17:\"IL\",18:\"IN\",19:\"IA\",20:\"KS\",21:\"KY\",22:\"LA\",23:\"ME\",24:\"MD\",25:\"MA\",26:\"MI\",27:\"MN\",\n",
    "          28:\"MS\",29:\"MO\",30:\"MT\",31:\"NE\",32:\"NV\",33:\"NH\",34:\"NJ\",35:\"NM\",36:\"NY\",37:\"NC\",38:\"ND\",39:\"OH\",\n",
    "          40:\"OK\",41:\"OR\",42:\"PA\",44:\"RI\",45:\"SC\",46:\"SD\",47:\"TN\",48:\"TX\",49:\"UT\",50:\"VT\",51:\"VA\",53:\"WA\",\n",
    "          54:\"WV\",55:\"WI\",56:\"WY\",72:\"PR\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "06b16a05-f241-42f2-8415-a6e350bc1818",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86.480591</td>\n",
       "      <td>32.469173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86.478142</td>\n",
       "      <td>32.470337</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86.478485</td>\n",
       "      <td>32.471489</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86.479645</td>\n",
       "      <td>32.469475</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86.479912</td>\n",
       "      <td>32.471939</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             ID20          x          y\n",
       "0  10010201001000 -86.480591  32.469173\n",
       "1  10010201001000 -86.478142  32.470337\n",
       "2  10010201001000 -86.478485  32.471489\n",
       "3  10010201001000 -86.479645  32.469475\n",
       "4  10010201001000 -86.479912  32.471939"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indv_df = merge_shape_and_states(states.keys()).drop('Unnamed: 0',axis=1)\n",
    "indv_df.rename(columns={'GEOID20':'ID20'},inplace=True)\n",
    "indv_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a542ba34-24aa-4fe4-ac3a-977a1fe07bc2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# cpu_df = indv_df.to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "28b1ad64-e36b-4c3c-b420-5ea8c5c117c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# cpu_df.to_csv('data/final_data_with_race.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "806ef1fe-db74-4c49-aa5e-ce7a1b032028",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "182532663"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(cpu_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a15baa0e-3a99-4afe-8f60-ead679ec4586",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>STATE</th>\n",
       "      <th>P_delta</th>\n",
       "      <th>points</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4496</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4497</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4498</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4499</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4500</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                ID20   x  y  STATE  P_delta  points\n",
       "4496  10010201001000 -86 32      1      -10      22\n",
       "4497  10010201001000 -86 32      1      -10      22\n",
       "4498  10010201001000 -86 32      1      -10      22\n",
       "4499  10010201001000 -86 32      1      -10      22\n",
       "4500  10010201001000 -86 32      1      -10      22"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset = indv_df.merge(df,on='ID20',how='left').sort_values('ID20')\n",
    "dataset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "71c76736-31ba-4ef3-ad77-35e5f3062422",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset['P_net'] = dataset['P_delta'].apply(lambda x: -1 if x < 0 else ( 1 if x>0 else 0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "72c9ae33-25a5-48e5-8d1a-79097a6e63f8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID20</th>\n",
       "      <th>x</th>\n",
       "      <th>y</th>\n",
       "      <th>STATE</th>\n",
       "      <th>P_delta</th>\n",
       "      <th>points</th>\n",
       "      <th>P_net</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>4496</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4497</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4498</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4499</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4500</th>\n",
       "      <td>10010201001000</td>\n",
       "      <td>-86</td>\n",
       "      <td>32</td>\n",
       "      <td>1</td>\n",
       "      <td>-10</td>\n",
       "      <td>22</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                ID20   x  y  STATE  P_delta  points  P_net\n",
       "4496  10010201001000 -86 32      1      -10      22     -1\n",
       "4497  10010201001000 -86 32      1      -10      22     -1\n",
       "4498  10010201001000 -86 32      1      -10      22     -1\n",
       "4499  10010201001000 -86 32      1      -10      22     -1\n",
       "4500  10010201001000 -86 32      1      -10      22     -1"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cdf = dataset.to_pandas()\n",
    "cdf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b6cd9ea1-fde1-4c19-a501-1b7eca2f7b5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# cdf.to_csv('data/final_data_with_race.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
